get_conus_spatial <- function(data, var, proj = 5070, abb = FALSE) {
  # Filter CONUS from a spatial US dataset and modify its projection.
  #
  # Args:
  #   data: Spatial US dataset to filter.
  #   var: Name (string) of the column of "data" holding states' information.
  #   proj: EPSG code to project dataset to, defaults to EPSG:5070.
  #   abb: If TRUE, then states in "var" are denoted with abbreviations;
  #        If FALSE, then states in "var" are denoted by their full names.
  #
  # Returns:
  #   A spatial CONUS dataset projected to given EPSG code.
  #
  # NOTE(review): only AK/HI/PR/GU are excluded; territories such as the
  # Virgin Islands or American Samoa would pass through if present in the
  # input -- confirm the source data never contains them.
  non_conus <- if (abb) {
    c("AK", "HI", "PR", "GU")
  } else {
    c("Alaska", "Hawaii", "Puerto Rico", "Guam")
  }
  # .data[[var]] is the tidy-eval-safe way to reference a column by a string
  # name inside filter(); get(var) relies on fragile environment lookup.
  conus <- filter(data, !(.data[[var]] %in% non_conus))
  sf::st_transform(conus, proj)
}
generate_tiles <- function(data, tile_type = TRUE) {
  # Generate tiled spatial data from four tessellation/coverage options.
  #
  # Args:
  #   data: Spatial dataset to tessellate.
  #   tile_type: "square"      - square coverage with "n = 70"
  #              "hexagonal"   - hexagonal coverage with "n = 70"
  #              "voroni"      - Voronoi tessellation
  #              "triangulate" - Delaunay triangulation
  #              anything else - data is returned unmodified
  #
  # Returns:
  #   A spatial, tiled dataset built from the given dataset and
  #   tessellation/coverage type, or the unmodified data.
  # Both tessellations seed from the combined feature centroids.
  seed_points <- function(x) sf::st_combine(sf::st_centroid(x))
  if (tile_type == "square") {
    tiles <- sf::st_make_grid(data, n = 70)
  } else if (tile_type == "hexagonal") {
    tiles <- sf::st_make_grid(data, n = 70, square = FALSE)
  } else if (tile_type == "voroni") {
    tiles <- sf::st_cast(sf::st_voronoi(seed_points(data)))
  } else if (tile_type == "triangulate") {
    tiles <- sf::st_cast(sf::st_triangulate(seed_points(data)))
  } else {
    # Unrecognized type: hand the data back untouched.
    return(data)
  }
  # Attach a sequential id so tiles can be joined/counted later.
  sf::st_as_sf(tiles) %>%
    mutate(id = 1:n())
}
plot_map <- function(data, title = "", include_features = TRUE) {
  # Plot spatial data using ggplot() + geom_sf().
  #
  # Args:
  #   data: Spatial dataset to plot.
  #   title: Title shown on plot.
  #   include_features: If TRUE, include caption of features collected.
  #
  # Returns:
  #   A plotted ggplot() map with a white background,
  #   navy border, size = 0.2, features collected, and given title.
  ggplot() +
    geom_sf(
      data = data,
      fill = "white",
      colour = "navy",
      size = 0.2
    ) +
    theme_void() +
    labs(
      title = title,
      # paste() already separates arguments with a single space; the old
      # literal " features collected" produced a doubled space.
      caption = if (include_features) {
        paste(nrow(data), "features collected")
      } else {
        ""
      }
    )
}
plot_map_pip <- function(data, title = "") {
  # Plot simple features point-in-polygon data using ggplot() + geom_sf().
  #
  # Args:
  #   data: simple features point-in-polygon dataset to plot; must carry a
  #         count column "n".
  #   title: Title shown on plot.
  #
  # Returns:
  #   A plotted ggplot() map using viridis colour scale, with a horizontal
  #   colour bar at the bottom and a total-dam-count caption.
  ggplot() +
    geom_sf(
      data = data,
      aes(fill = n),
      size = 0.2,
      col = NA
    ) +
    scale_fill_viridis(
      option = "plasma",
      name = "Number of Dams",
      guide = guide_colorbar(
        direction = "horizontal",
        barheight = unit(2, units = "mm"),
        barwidth = unit(50, units = "mm"),
        draw.ulim = FALSE,  # spell out FALSE; F is reassignable
        title.position = "top",
        title.hjust = 0.5,
        label.hjust = 0.5
      )
    ) +
    theme_void() +
    theme(legend.position = "bottom") +
    labs(
      title = title,
      # paste() separates with a single space; the old literal " dams"
      # produced a doubled space.
      caption = paste(sum(data$n), "dams")
    )
}
compute_features <- function(sf_data, description) {
  # Compute calculations on a "sf" object.
  #
  # Args:
  #   sf_data: A simple features object.
  #   description: A description of the data frame output.
  #
  # Returns:
  #   A one-row data frame with "description" as the first column, and
  #   following, the number of features, mean area of the "sf" object
  #   (km^2), standard deviation of the area, and total area.
  features_area <- sf::st_area(sf_data) %>%
    set_units("km^2") %>%
    drop_units()
  # nrow() yields the feature count directly; the previous count() call
  # returned a data frame (n + geometry) that then needed the `[-3]`
  # column drop and a colnames() rename to clean up.
  data.frame(
    DESCRIPTION = description,
    NUM_FEATURES = nrow(sf_data),
    MEAN_AREA = mean(features_area),
    STANDARD_DEV = sd(features_area),
    TOTAL_AREA = sum(features_area)
  )
}
point_in_polygon <- function(points, polygon, compare) {
  # Performs point-in-polygon computation via a given column.
  #
  # Args:
  #   points: A spatial dataset of points.
  #   polygon: A spatial dataset of polygons.
  #   compare: Name (string) of the column from polygons to count the
  #            number of points in.
  #
  # Returns:
  #   A spatial dataset counting, in column "n", the points in any
  #   given polygon.
  #
  # NOTE(review): st_join() is a left join by default, so a polygon that
  # contains zero points still yields one (all-NA) joined row and is
  # counted as n = 1 -- confirm this is acceptable for the analysis.
  st_join(polygon, points) %>%
    st_drop_geometry() %>%
    # .data[[compare]] references the column by its string name; the old
    # count(get(compare)) relied on fragile environment lookup.
    count(.data[[compare]]) %>%
    setNames(c(compare, "n")) %>%
    left_join(polygon, by = compare) %>%
    st_as_sf()
}
For explicit function code, click the above CODE button.
get_conus_spatial() — Filter CONUS from a spatial US dataset and modify its projection.
Args:
data: Spatial US dataset to filter.
var: Column of "data" that contains states' information.
proj: EPSG code to project dataset to, defaults to EPSG:5070.
abb: If TRUE, then states in "var" are denoted with abbreviations;
If FALSE, then states in "var" are denoted by their full names.
Returns:
A spatial CONUS dataset projected to given EPSG code.
generate_tiles() — Generate tiled spatial data from four tessellation/coverage options.
Args:
data: Spatial dataset to tessellate
tile_type: "square" - square coverage with "n = 70"
"hexagonal" - hexagonal coverage with "n = 70"
"voroni" - Voronoi tessellation
"triangulate" - Delaunay triangulation
none of the above - unmodified data
Returns:
A spatial, tiled dataset from a given dataset and
tessellation/coverage type or unmodified data.
plot_map() — Plot spatial data using ggplot() + geom_sf().
Args:
data: Spatial dataset to plot.
title: Title shown on plot.
include_features: If TRUE, include caption of features collected.
Returns:
A plotted ggplot() map with, a white background,
navy border, size = 0.2, features collected, and given title.
plot_map_pip() — Plot simple features point-in-polygon data using ggplot() + geom_sf().
Args:
data: simple features point-in-polygon dataset to plot.
title: Title shown on plot.
Returns:
A plotted ggplot() map using viridis colour scale.
compute_features() — Compute calculations on a "sf" object.
Args:
sf_data: A simple features object.
description: A description of the data frame output.
Returns:
A data frame with "description" as the first column, and following,
the number of features, mean area of the "sf" object, standard deviation
of the area, and total area.
point_in_polygon() — Performs point-in-polygon computation via a given column.
Args:
points: A spatial dataset of points.
polygon: A spatial dataset of polygons.
compare: The column from polygons to count the number
of points in.
Returns:
A spatial dataset of two columns counting the points in any
given polygon.
# Step 1.1: CONUS county polygons, projected to EPSG:5070
conus_counties <- get_conus_spatial(USAboundaries::us_counties(), "state_name")
# Step 1.2/1.3: build the four tessellation/coverage variants
conus_square      <- generate_tiles(conus_counties, "square")
conus_hexagonal   <- generate_tiles(conus_counties, "hexagonal")
conus_voroni      <- generate_tiles(conus_counties, "voroni")
conus_triangulate <- generate_tiles(conus_counties, "triangulate")
# Testing before Step 1.4/1.5: a single CONUS outline, then a simplified
# copy keeping 5% of its points
conus_border <- st_union(conus_counties)
conus_simple <- rmapshaper::ms_simplify(conus_border, keep = 0.05)
One thing to notice about the data that we are going to perform analysis on, in particular the tessellated spatial datasets, is that we have a lot of surrounding excess data. The goal here with tidying these datasets is to create a suitable dataset for performing analysis on, and in this case, that means specifically CONUS representative data. The first thing we will do is to create a simplified border of CONUS:
# Unioned (unsimplified) CONUS border with its point count
ggplot() +
  geom_sf(data = conus_border) +
  theme_void() +  # theme_void() was called twice; the duplicate was a no-op
  labs(subtitle = paste(
    # paste() separates with a space; the old trailing ": " doubled it
    "Unioned CONUS Points:",
    mapview::npts(conus_border)
  ))
# Simplified CONUS border with its point count
ggplot() +
  geom_sf(data = conus_simple) +
  theme_void() +
  labs(subtitle = paste(
    # paste() separates with a space; the old trailing ": " doubled it
    "Simplified CONUS Points:",
    mapview::npts(conus_simple)
  ))
By simplifying we were able to remove 3068 points, while still keeping roughly the same shape that we want. While it may seem that we do not want to lose data, for the analysis that we will be doing, a great amount of detail isn’t necessary. Here, we want to visualize the overlap of our simplified CONUS border over our tessellations, so that we can see exactly what we are trying to get rid of. Note that, while the overlapping CONUS border looks like it is a MULTILINESTRING, it’s really a geom_sf() with fill = NA. Now, notice the border excludes most of the excess data surrounding the tessellated spatial datasets that we do not need:
# Overlapping Plot with CONUS map to cut edges
ggplot() +
  geom_sf(data = conus_voroni, fill = NA) +
  geom_sf(
    data = conus_simple,
    color = "green",
    fill = NA
  ) +
  theme_void() +
  # paste0() avoids the stray spaces paste()'s default separator inserted
  # around the newline and after the colons
  labs(subtitle = paste0(
    "Simplified CONUS Points: ", mapview::npts(conus_simple),
    "\nVoroni Tessellation Points: ", mapview::npts(conus_voroni)
  ))
# Overlapping Plot with CONUS map to cut edges
ggplot() +
  geom_sf(data = conus_triangulate, fill = NA) +
  geom_sf(
    data = conus_simple,
    color = "green",
    fill = NA
  ) +
  theme_void() +
  # paste0() avoids the stray spaces paste()'s default separator inserted
  # around the newline and after the colons
  labs(subtitle = paste0(
    "Simplified CONUS Points: ", mapview::npts(conus_simple),
    "\nDelaunay Triangulation Points: ", mapview::npts(conus_triangulate)
  ))
Using sf::st_intersection(), we can get rid of the excess data. Using this function, we have the following spatial datasets:
# Clip the Voronoi tessellation to the simplified CONUS border, then plot
conus_voroni_tidy <- st_intersection(conus_voroni, conus_simple)
plot_map(conus_voroni_tidy, title = "Tidied Voroni Tessellation",
         include_features = FALSE)
# Same clipping for the Delaunay triangulation
conus_triangulate_tidy <- st_intersection(conus_triangulate, conus_simple)
plot_map(conus_triangulate_tidy, title = "Tidied Delaunay Triangulation",
         include_features = FALSE)
So, this gives us each spatial dataset that we will be using:
# Area statistics for each dataset, rendered as a styled HTML table
feature_summary <- bind_rows(
  compute_features(conus_counties, "US Counties"),
  compute_features(conus_square, "Square Coverage"),
  compute_features(conus_hexagonal, "Hexagonal Coverage"),
  compute_features(conus_voroni_tidy, "Voroni Tessellation"),
  compute_features(conus_triangulate_tidy, "Delaunay Triangulation")
)
knitr::kable(
  feature_summary,
  col.names = c(
    "Description",
    "Number of Features",
    "Area Mean",
    "Area Standard Deviation",
    "Total Area"
  ),
  caption = "Tessellation Features Data"
) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE,   # spell out FALSE; F is reassignable
    position = "center"   # dropped the trailing comma (empty argument)
  ) %>%
  kableExtra::column_spec(
    2,
    color = kableExtra::spec_color(
      feature_summary$NUM_FEATURES,
      end = 0.75, option = "C", direction = -1
    )
  ) %>%
  kableExtra::column_spec(
    5,
    color = kableExtra::spec_color(
      feature_summary$TOTAL_AREA,
      end = 0.75, option = "C", direction = -1
    )
  )
| Description | Number of Features | Area Mean | Area Standard Deviation | Total Area |
|---|---|---|---|---|
| US Counties | 3108 | 2521.745 | 3404.326 | 7837583 |
| Square Coverage | 3108 | 2728.125 | 0.000 | 8479013 |
| Hexagonal Coverage | 2271 | 3763.052 | 0.000 | 8545892 |
| Voroni Tessellation | 3106 | 2525.425 | 2886.661 | 7843970 |
| Delaunay Triangulation | 6195 | 1253.691 | 1578.647 | 7766614 |
When we analyze these tessellations, some important information needs to be taken into account. From the Tessellation Features Data table, we can observe a few different attributes, such as: number of features, mean of feature areas, standard deviation of feature areas, and the total area. The importance of this data relates to the modifiable areal unit problem and inherent statistical bias; for example, consider the original dataset versus the other tessellations. When the original dataset is tessellated, there is a potential loss of accuracy with point-in-polygon analysis, as points may potentially fall into differing areas with the tessellated datasets. However, the tradeoffs for accuracy come with computation. Take hexagonal coverage versus Delaunay triangulation for example. Since hexagonal coverage has a significantly smaller number of features than Delaunay triangulation, the efficiency of computing point-in-polygon analysis would be much faster. Although, depending on the time complexity of the method used for computing the analysis, this may not be as much of an issue (i.e. \(O(n)\) versus \(O(\log(n))\)).
# Load the NID dam records, drop rows without coordinates, and project the
# points to the same CRS (EPSG:5070) as the tessellations
us_dams <- readxl::read_excel("data/NID2019_U.xlsx") %>%
  filter(!is.na(LONGITUDE), !is.na(LATITUDE)) %>%
  st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs = 4326) %>%
  st_transform(5070)
# Point-in-polygon dam counts for each spatial dataset
pip_counties  <- point_in_polygon(us_dams, conus_counties, "geoid")
pip_square    <- point_in_polygon(us_dams, conus_square, "id")
pip_hexagonal <- point_in_polygon(us_dams, conus_hexagonal, "id")
pip_voroni    <- point_in_polygon(us_dams, conus_voroni_tidy, "id")
pip_delaunay  <- point_in_polygon(us_dams, conus_triangulate_tidy, "id")